pRactice corner: Tidy Tuesday Series

lruolin

Load Packages

library(tidyverse)
library(tidytuesdayR)
library(ggthemes)
library(mapproj)

Load Data from tidytuesdayR package

# Download the Tidy Tuesday release for 2018, week 32.
tt_data <- tt_load(2018, week = 32)

# Open the readme that ships with the release.
readme(tt_data)

# Bind the table to `us_wind`, the name every later chunk reads from.
us_wind <- tt_data$us_wind

Explore Dataset

# Compact column-by-column overview: names, types and the first few values.
us_wind %>%
  glimpse()

Rows: 58,185
Columns: 24
$ case_id    <dbl> 3073429, 3071522, 3073425, 3071569, 3005252, 3003…
$ faa_ors    <chr> "missing", "missing", "missing", "missing", "miss…
$ faa_asn    <chr> "missing", "missing", "missing", "missing", "miss…
$ usgs_pr_id <dbl> 4960, 4997, 4957, 5023, 5768, 5836, 4948, 5828, 4…
$ t_state    <chr> "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "…
$ t_county   <chr> "Kern County", "Kern County", "Kern County", "Ker…
$ t_fips     <chr> "06029", "06029", "06029", "06029", "06029", "060…
$ p_name     <chr> "251 Wind", "251 Wind", "251 Wind", "251 Wind", "…
$ p_year     <dbl> 1987, 1987, 1987, 1987, 1987, 1987, 1987, 1987, 1…
$ p_tnum     <dbl> 194, 194, 194, 194, 194, 194, 194, 194, 194, 194,…
$ p_cap      <dbl> 18.43, 18.43, 18.43, 18.43, 18.43, 18.43, 18.43, …
$ t_manu     <chr> "Vestas", "Vestas", "Vestas", "Vestas", "Vestas",…
$ t_model    <chr> "missing", "missing", "missing", "missing", "miss…
$ t_cap      <dbl> 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 9…
$ t_hh       <dbl> -9999, -9999, -9999, -9999, -9999, -9999, -9999, …
$ t_rd       <dbl> -9999, -9999, -9999, -9999, -9999, -9999, -9999, …
$ t_rsa      <dbl> -9999, -9999, -9999, -9999, -9999, -9999, -9999, …
$ t_ttlh     <dbl> -9999, -9999, -9999, -9999, -9999, -9999, -9999, …
$ t_conf_atr <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
$ t_conf_loc <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3…
$ t_img_date <chr> "1/1/2012", "1/1/2012", "1/1/2012", "7/31/2016", …
$ t_img_srce <chr> "NAIP", "NAIP", "NAIP", "Digital Globe", "Digital…
$ xlong      <dbl> -118.3607, -118.3612, -118.3604, -118.3640, -118.…
$ ylat       <dbl> 35.08378, 35.08151, 35.08471, 35.07942, 35.08559,…

Count state

# Rows per state, largest first.
us_wind %>%
  count(t_state, sort = TRUE)

# A tibble: 45 x 2
   t_state     n
   <chr>   <int>
 1 TX      13232
 2 CA       9037
 3 IA       4280
 4 OK       3821
 5 KS       2898
 6 IL       2602
 7 MN       2547
 8 CO       2278
 9 OR       1868
10 WA       1744
# … with 35 more rows

Count project names

# Rows per project name, largest first.
us_wind %>%
  count(p_name, sort = TRUE)

# A tibble: 1,479 x 2
   p_name                                     n
   <chr>                                  <int>
 1 unknown Tehachapi Wind Resource Area 1  1831
 2 Green Ridge Power                        516
 3 Stateline Wind Project                   440
 4 Mesa Wind Farm                           432
 5 Sky River                                335
 6 Cedar Creek                              274
 7 Peetz Table                              267
 8 Flat Ridge 2                             261
 9 Rolling Hills                            259
10 Woodward Mountain I & II                 242
# … with 1,469 more rows

Plot longitude and latitude

# Scatter every turbine by longitude/latitude over the US state outlines.
# Rows with xlong >= 100 are treated as outliers and left out.
ggplot(
  data = filter(us_wind, xlong < 100),
  mapping = aes(x = xlong, y = ylat)
) +
  geom_point() +
  borders("state") +
  coord_map() +
  theme_void()

Distribution of wind turbines in US States

# Tally of rows per state, built from the state column as a factor.
count_states <- fct_count(factor(us_wind$t_state))

# Map of turbines, leaving out Alaska, Hawaii, Guam and Puerto Rico.
ggplot(
  data = filter(us_wind, !t_state %in% c("AK", "HI", "GU", "PR")),
  mapping = aes(x = xlong, y = ylat)
) +
  geom_point() +
  borders("state") +
  coord_map() +
  labs(
    title = "Distribution of wind turbines in US",
    subtitle = "Most wind turbines are situated along middle of US.",
    caption = "Source: USGS.gov"
  ) +
  theme_void()

Projects

# Keep an untouched copy of the imported table.
us_wind_raw <- us_wind

# Analysis table: drop AK, HI, GU and PR, then turn the -9999 placeholder
# into NA. na_if() is applied column by column on the numeric columns because
# dplyr >= 1.1.0 no longer accepts a whole data frame as its first argument.
us_wind_processed <- us_wind %>%
  filter(!t_state %in% c("AK", "HI", "GU", "PR")) %>%
  mutate(across(where(is.numeric), ~ na_if(.x, -9999)))

# One row per (project name, state): number of turbines plus the centre and
# spread of their coordinates. sd() is NA for groups with a single turbine.
# ungroup() drops the leftover p_name grouping that summarise() would
# otherwise keep on the result.
wind_projects <- us_wind_processed %>%
  group_by(p_name, t_state) %>%
  summarise(
    turbines = n(),
    long = mean(xlong),
    lat = mean(ylat),
    long_sd = sd(xlong),
    lat_sd = sd(ylat)
  ) %>%
  ungroup()

# Map one point per project; colour and size both encode the turbine count.
# The size mapping is inherited from ggplot(), so geom_point() needs no aes().
# Subtitle corrected: the sorted project-name counts put "unknown Tehachapi
# Wind Resource Area 1" (1,831 rows) far ahead of 251 Wind (190 turbines).
wind_projects %>%
  ggplot(aes(long, lat, col = turbines, size = turbines)) +
  geom_point(show.legend = TRUE) +
  scale_color_continuous(type = "viridis") +
  borders("state") +
  coord_map() +
  labs(
    title = "Distribution of projects in US",
    subtitle = "The biggest project (by turbine count) is in the Tehachapi area, California",
    caption = "Source: usgs.gov"
  ) +
  theme_void()

To find out which project is the biggest:

# Turbines per (project, state), largest first, so the biggest project is the
# first row. Without sort = TRUE the rows come back in alphabetical order.
us_wind_processed %>%
  count(p_name, t_state, sort = TRUE)

# A tibble: 1,440 x 3
   p_name                      t_state     n
   <chr>                       <chr>   <int>
 1 251 Wind                    CA        190
 2 30 MW Iowa DG Portfolio     IA         10
 3 6th Space Warning Squadron  MA          2
 4 Adair                       IA         76
 5 Adams                       IA         64
 6 Adams Wind Generations, LLC MN         12
 7 AFCEE MMR Turbines          MA          2
 8 AG Land 1                   IA          1
 9 AG Land 2                   IA          1
10 AG Land 3                   IA          1
# … with 1,430 more rows

Year

# One row per (project name, state), now with the first project year and the
# summed turbine capacity. min(..., na.rm = TRUE) returns Inf (with a warning)
# for a group whose p_year values are all NA. ungroup() drops the leftover
# p_name grouping that summarise() would otherwise keep on the result.
wind_projects <- us_wind_processed %>%
  group_by(p_name, t_state) %>%
  summarise(
    year = min(p_year, na.rm = TRUE), # first year project started
    turbines = n(),
    total_capacity_kw = sum(t_cap, na.rm = TRUE),
    lon = mean(xlong),
    lat = mean(ylat),
    lon_sd = sd(xlong),
    lat_sd = sd(ylat)
  ) %>%
  ungroup()

# Histogram of the first year of each project (default binning).
ggplot(data = wind_projects, mapping = aes(x = year)) +
  geom_histogram(fill = "deepskyblue4") +
  labs(
    title = "Distribution of projects by year",
    subtitle = "Wind Turbine Projects gained momentum after 2000",
    caption = "Source: usgs.gov"
  ) +
  theme_clean()

# Map one point per project: size encodes the turbine count, colour the first
# project year. The size mapping is inherited from ggplot(), so geom_point()
# does not need to repeat it.
wind_projects %>%
  ggplot(aes(lon, lat, size = turbines, col = year)) +
  geom_point(show.legend = TRUE) +
  scale_color_continuous(type = "viridis") +
  borders("state") +
  coord_map() +
  labs(
    title = "Age and Scale of US Wind Turbine Projects",
    subtitle = "251 Wind in CA is the oldest project, and the newer projects are situated along middle of the country",
    caption = "Source: usgs.gov"
  ) +
  theme_void()

Capacity

# How many distinct p_cap values each project name carries, largest first.
# A count above 1 means the same name appears with several capacities.
us_wind_processed %>%
  distinct(p_name, p_cap) %>% # capacity
  count(p_name, sort = TRUE)

# A tibble: 1,425 x 2
   p_name                        n
   <chr>                     <int>
 1 McNeilus                      5
 2 Bishop Hill I                 3
 3 Blue Summit                   3
 4 Capricorn Ridge               3
 5 Capricorn Ridge expansion     3
 6 Case Western University       3
 7 Century Expansion             3
 8 Crossroads                    3
 9 Crow Lake                     3
10 Horse Hollow II               3
# … with 1,415 more rows

# Print the per-project summary: first year, turbine count, summed turbine
# capacity, and the centre and spread of the turbine coordinates.
us_wind_processed %>%
  group_by(p_name, t_state) %>%
  summarise(
    year = min(p_year, na.rm = TRUE), # first year project started
    turbines = n(),
    total_capacity_kw = sum(t_cap, na.rm = TRUE),
    lon = mean(xlong),
    lat = mean(ylat),
    lon_sd = sd(xlong),
    lat_sd = sd(ylat)
  ) %>%
  ungroup()

# A tibble: 1,440 x 9
   p_name         t_state  year turbines total_capacity_…    lon   lat
   <chr>          <chr>   <dbl>    <int>            <dbl>  <dbl> <dbl>
 1 251 Wind       CA       1987      190            18050 -118.   35.1
 2 30 MW Iowa DG… IA       2017       10            30000  -93.4  42.0
 3 6th Space War… MA       2013        2             3360  -70.5  41.8
 4 Adair          IA       2008       76           174800  -94.7  41.5
 5 Adams          IA       2016       64           154284  -94.7  40.9
 6 Adams Wind Ge… MN       2011       12            20040  -94.7  44.9
 7 AFCEE MMR Tur… MA       2011        2             3000  -70.5  41.8
 8 AG Land 1      IA       2012        1             1600  -93.3  42.2
 9 AG Land 2      IA       2012        1             1600  -93.4  42.1
10 AG Land 3      IA       2012        1             1600  -93.4  42.1
# … with 1,430 more rows, and 2 more variables: lon_sd <dbl>,
#   lat_sd <dbl>

How has turbine capacity changed over time?

# Per-project summary used for the capacity-over-time plot.
# sum(t_cap) is left without na.rm: a project with any missing t_cap gets an
# NA total, rather than a partial total that would later be divided by the
# full turbine count.
turbine <- us_wind_processed %>%
  group_by(p_name, t_state) %>%
  summarise(
    year = min(p_year, na.rm = TRUE), # first year project started
    turbines = n(),
    total_capacity_kw = sum(t_cap),
    lon = mean(xlong),
    lat = mean(ylat),
    lon_sd = sd(xlong),
    lat_sd = sd(ylat)
  ) %>%
  ungroup()


# Capacity per turbine for each project against its first year, with a
# linear trend line.
ggplot(
  data = turbine,
  mapping = aes(x = year, y = total_capacity_kw / turbines)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Change in Total Capacity per Turbine over Time",
    subtitle = "Total Capacity per Turbine increased over time",
    caption = "Source: usgs.gov"
  ) +
  theme_few()

Turbine models

# One row per turbine model: median specifications plus how many turbines and
# projects use it, most widely used first. The -9999 placeholders are NA in
# us_wind_processed, so na.rm = TRUE keeps one missing value from turning a
# model's whole median into NA. A model with no recorded values still gets NA.
turbine_models <- us_wind_processed %>%
  group_by(t_model) %>%
  summarise(
    t_cap = median(t_cap, na.rm = TRUE), # turbine capacity (kW)
    t_hh = median(t_hh, na.rm = TRUE), # turbine hub height (m)
    t_rd = median(t_rd, na.rm = TRUE), # turbine rotor diameter (m)
    t_rsw = median(t_rsa, na.rm = TRUE), # turbine rotor swept area (m2)
    t_ttlh = median(t_ttlh, na.rm = TRUE), # turbine total height calculated (m)
    turbines = n(), # number of turbines
    projects = n_distinct(p_name) # number of projects
  ) %>%
  arrange(desc(projects))

# Median total height against median capacity, one point per turbine model.
ggplot(data = turbine_models, mapping = aes(x = t_ttlh, y = t_cap)) +
  geom_point() +
  labs(
    title = "Relationship between turbine height and capacity",
    subtitle = "Taller Turbines have higher capacity",
    x = "Turbine Total Height Calculated (m)",
    y = "Turbine Capacity (kW)"
  ) +
  theme_clean()

Learning points:

Plotting a map of the US using ggplot2
Replacing the -9999 missing-value placeholder with NA using dplyr::na_if
Data cleaning should be done at the start of the analysis

References

https://www.youtube.com/watch?v=O1oDIQV6VKU&list=PL19ev-r1GBwkuyiwnxoHTRC8TTqP8OEi8&index=78

Comment on this article Share:

Tidy Tuesday Series

Load Packages

Load Data from tidytuesdayR package

Explore Dataset

Count state

Count project names

Plot longitude and latitude

Distribution of wind turbines in US States

Projects

Year

Capacity

How has turbine capacity changed over time?

Turbine models

Learning points:

References

Citation